library(tm)
library(plyr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(readr)
library(stringr)
library(scales)
library(tidytext)
library(textdata)
library(ngram)
library(forcats)
library(syuzhet)
library(wordcloud)
library(reshape2)
library(topicmodels)
library(DT)
library(gridExtra)
# This data is processed from Text_Pre-processing file
hm_data <- read_csv("../data/output/processed_moments.csv")
urlfile<-'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/demographic.csv'
demo_data <- read_csv(urlfile)
names(hm_data)
[1] "hmid" "wid" "reflection_period" "original_hm"
[5] "cleaned_hm" "modified" "num_sentence" "ground_truth_category"
[9] "predicted_category" "id" "text"
## Select the data
happydb <- hm_data %>%
inner_join(demo_data, by = "wid") %>%
select(gender,
reflection_period,
text)
# Merge the text data with demographic data into a CSV file
# write_csv(happydb, "../data/output/happydb.csv")
happydb = read.csv("../data/output/happydb.csv")
In order to analyze the components of the text in each group, the barplot and wordcloud are used. #### Step 1 : Filter data
female <- happydb %>%
filter(gender == 'f')
male <- happydb %>%
filter(gender == 'm')
urlfile<-'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/senselabel.csv'
sense_data <- read.csv(urlfile, stringsAsFactors = F)
data("stop_words")
word <- c("happy","ago","yesterday","lot","today","months","month",
"happier","happiest","last","week","past","day")
stop_words <- stop_words %>%
bind_rows(mutate(tibble(word), lexicon = "updated"))
# females' answer word cleaning
female_text <- female$text %>% as.vector() %>%
tibble(text = .) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
Joining, by = "word"
# males' answer word cleaning
male_text <- male$text %>% as.vector() %>%
tibble(text = .) %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
Joining, by = "word"
freq_f <- female_text %>% count(word, sort = TRUE) # count frequency and sort by descending order
freq_m <- male_text %>% count(word, sort = TRUE) # count frequency and sort by descending order
# visualize the frequency
freq_f %>%
filter(n > 1500) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col(fill = "pink") +
xlab(NULL) +
ggtitle("Female word frequency")+
coord_flip()
freq_m %>%
filter(n > 1500) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col(fill= "skyblue") +
xlab(NULL) +
ggtitle("Male word frequency")+
coord_flip()
# join each gender's word frequency
frequency <- bind_rows(mutate(freq_f, gender = "Female"),
mutate(freq_m, gender = "Male")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(gender, word) %>%
group_by(gender) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>% # delete n var
spread(gender, proportion) %>% # word female male as columns
gather(gender, proportion, `Male`) %>% drop_na()
ggplot(frequency, aes(x = proportion, y = `Female`, color = abs(`Female` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
facet_wrap(~gender, ncol = 2) +
theme(legend.position="none") +
labs(y = "Female", x = NULL)
Words that are close to the line in these plots have similar frequencies in both sets of texts, which means there is huge similarity between females’ response and males’ response
# combine with position
pos <- sense_data %>%
select(lowercaseLemma, POS) %>%
rename(word = lowercaseLemma) %>%
unique()
pos_f <- freq_f %>% inner_join(pos) %>%
group_by(POS) %>%
summarise(n = sum(n)) %>%
mutate(proportion = n /sum(n))
Joining, by = "word"
pos_m <- freq_m %>% inner_join(pos) %>%
group_by(POS) %>%
summarise(n= sum(n)) %>%
mutate(proportion = n /sum(n))
Joining, by = "word"
pos_f %>%
mutate(POS = reorder(POS, proportion)) %>%
ggplot(aes(POS, proportion)) +
geom_col(fill = "pink") +
xlab(NULL) +
ggtitle("Female Part of Speech Proportion")+
coord_flip()
pos_m %>%
mutate(POS = reorder(POS, proportion)) %>%
ggplot(aes(POS, proportion)) +
geom_col(fill = "skyblue") +
xlab(NULL) +
ggtitle("Male Part of Speech Proportion")+
coord_flip()
### General Word Cloud
# palatte
colorVec_f = rep(c('red', 'lightpink'), length.out=nrow(freq_f))
freq_f %>% with(wordcloud(word, n, max.words = 100, colors =colorVec_f))
# palatte
colorVec_m = rep(c('blue', 'skyblue'), length.out=nrow(freq_m))
freq_m %>% with(wordcloud(word, n, max.words = 100, colors =colorVec_m))
The barplots show that both male and female like use nouns, verbs and adjs to express the happy moments. The top 10 words of frequency in each groups also support the notation. Women have 8 nouns and 2 verbs and man have 7 nouns, 2 verbs and 1 adverb(finally).
In the barplot and wordcloud graphs of both female and male, we can find that both women and men are enjoyed the happy moments with their friends. But women tend to be more bond with people around them for the words like husband, daughter, son, family come up more frequently than men’s.
For the infomation of the difference in the numbers and contents of periods in each gender group, I’m gonna use the number count, word cloud and ks-test.
ggplot(happydb,aes(x = happydb$gender,fill = happydb$reflection_period))+
geom_bar(position = "fill") +
labs(title = 'Reflection Period of Different Gender', x = 'Gender')
fre
Error: object 'fre' not found
word_f <- bind_rows(mutate(freq_f24h, time = "24h"),
mutate(freq_f3m, time = "3m"))
word_m <- bind_rows(mutate(freq_m24h, time = "24h"),
mutate(freq_m3m, time = "3m"))
word_f %>%
acast(word ~ time, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("grey20", "pink"),
max.words = 100)
word_m %>%
acast(word ~ time, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("grey20", "skyblue"),
max.words = 100)
ks.test(pos_f24h$proportion, pos_f3m$proportion)
ks.test(pos_m24h$proportion, pos_m3m$proportion)
There is no difference between the part of speech chosen in the 3 month or 24 hour within each gender group.
h24 <- bind_rows(mutate(pos_f24h, gender = "female"),
mutate(pos_m24h, gender = "male"))
m3 <- bind_rows(mutate(pos_f3m, gender = "female"),
mutate(pos_m3m, gender = "male"))
h24 %>%
group_by(gender) %>%
ungroup() %>%
mutate(POS = reorder(POS, n)) %>%
ggplot(aes(POS, n, fill = gender)) +
geom_col(show.legend = FALSE) +
facet_wrap(~gender,scales = "free_y") +
labs(y = "Contribution to Part Of Speech",
x = NULL, title = "24 hour") +
coord_flip()
m3 %>%
group_by(gender) %>%
ungroup() %>%
mutate(POS = reorder(POS, n)) %>%
ggplot(aes(POS, n, fill = gender)) +
geom_col(show.legend = FALSE) +
facet_wrap(~gender,scales = "free_y") +
labs(y = "Contribution to Part Of Speech",
x = NULL , title = "3 month") +
coord_flip()
#use the ks.test
ks.test(pos_f24h$proportion,pos_m24h$proportion);ks.test(pos_f3m$proportion,pos_m3m$proportion)
#### Step 5. Topic Modeling
For both female and male, there is no difference in the total number of the reflection period. But with the content of each period, female and male show differently.
Since the p-values of Kolmogorov-Smirnov test are very close to 1, the null hypothesis cannot be rejected at 0.01 significance which means there is no statistically difference between the POS of 24 hours and that of 3 months in each gender group.
After removing the most frequently used words in each group, the results show that in 24-hour’s memory, women have more ‘fleeting’ words - words describing movement, such as watched, feel, enjoy, ect. In 3-month’s memory, women have more nouns describing the person they shared the happy moments with. This is compatible with the memory loss. But focusing on men’s words in different reflection period, the POS of the words seem remain.
# obtained sentiment
bing = get_sentiments("bing")
female_sentiment <- female_text %>% inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
male_sentiment <- male_text %>% inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
female_sentiment %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
male_sentiment %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
The top 10 words in positive and negative aspect don’t differ from each gender, just the order changed.
female_text %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("grey20", "pink"),
max.words = 100)
male_text %>%
inner_join(bing) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "skyblue"),
max.words = 100)
Here we use the nrc sentiment
nrc = get_sentiments("nrc")
nrc_f <- female_text %>%
inner_join(nrc) %>%
count(word,sentiment) %>%
spread(sentiment, n, fill = 0) %>%
select(-word) %>%
as.matrix() %>%
apply(2,sum)
nrc_m <- male_text %>%
inner_join(nrc) %>%
count(word,sentiment) %>%
spread(sentiment, n, fill = 0) %>%
select(-word) %>%
as.matrix() %>%
apply(2,sum)
#ks.test of sentiment values of each group
ks.test(nrc_f, nrc_m)
Both female and male use more positive words than negative words to express their happy moments. For the sense of words, the words which show joy and anticipation are more likely used for happy moments. The histogram shows both mean of the sentiment values are concerntrated on 0. Under the ks.test, there is no significantly difference between the sense used by women and that by man.
From the analysis above, we can firstly know that the happy moments are mostly postive which is confirmed with the name of the research ‘Happy Moments’. Both women and men used nouns, verbs and adjectives to express their happy moments. Besides, women like to remember the people who spent the happy moments with them but men don’t show the tendency.
There is no statistically difference between the numbers of happy moments of different period in each gender group.But the content of the word used by females and males show difference. Women tends to use verbs or adjectives to describe their happy moment in short-term memories, more nouns for discription in longer-term memories while men don’t show the same tendency.